source("Scripts/libraries.R")
source("Scripts/variables.R")
source("Scripts/functions.R")

1 Examples of user history

rm(list=ls())
source("Scripts/libraries.R")
source("Scripts/variables.R")
source("Scripts/functions.R")

1.1 Kindara

1.2 Sympto

load(paste0(IO$sympto_data_02_processed,'cycledays.Rdata'), verbose = TRUE)
## Loading objects:
##   cycledays
agg = aggregate(cycle_nb ~ user_id, cycledays,lu)
j = which(agg$cycle_nb >= 50)

user_ids = agg$user_id[j]
rm(j, agg)
for(u in user_ids[1:10]){
    k = which(cycledays$user_id == u)
    plot.tracking.history(d = cycledays[k,], show_goal = TRUE, relative_date = TRUE)
}

rm(u, k)
u = 1226
k = which(cycledays$user_id == u)
plot.tracking.history(d = cycledays[k,], show_goal = TRUE, relative_date = TRUE)

cycledays_sympto_user_history = cycledays[k,]

save(cycledays_sympto_user_history,file = paste0(IO$restricted_figure_data,"history_sympto.Rdata"))

rm(u, k, cycledays_sympto_user_history, cycledays, user_ids)

2 Users demographics

2.1 Kindara

source("Scripts/_setup_Kindara.R")
rm(list=ls())
source("Scripts/libraries.R")
source("Scripts/variables.R")
source("Scripts/functions.R")
load(file = paste0(IO$kindara_data_accounts,'processed_cleaned_accounts_onefile.Rdata'), verbose = TRUE)
## Loading objects:
##   accounts
round(mean(accounts$age_registration, na.rm = TRUE), digits = 1)
## [1] 29.2
round(sd(accounts$age_registration, na.rm = TRUE), digits = 1)
## [1] 5.9
round(100 - (sum(is.na(accounts$age_registration))/nrow(accounts)*100))
## [1] 13
users_demo_kindara = accounts[,c("age_registration","age_now")]
save(users_demo_kindara,file = paste0(IO$restricted_figure_data,"users_demo_kindara.Rdata"))
rm(accounts, users_demo_kindara)

2.2 Sympto

source("Scripts/_setup_Sympto.R")
load(paste0(IO$sympto_data_02_processed,'users.Rdata'), verbose = TRUE)
## Loading objects:
##   users
users_demo = users[,c("age_registration","age_now","menarche_year","cm","kg","bmi")]

users_demo$cm_o = users_demo$cm
users_demo$cm[which(users_demo$cm < 100)] = 100 + users_demo$cm[which(users_demo$cm < 100)] 

users_demo$bmi_o = users_demo$bmi
users_demo$bmi = users$kg / ((users_demo$cm/100)^2)


summary(users_demo)
##  age_registration    age_now      menarche_year         cm       
##  Min.   : 0.00    Min.   : 0.00   Min.   : 8.00   Min.   :100.0  
##  1st Qu.:26.00    1st Qu.:29.00   1st Qu.:12.00   1st Qu.:160.0  
##  Median :29.00    Median :33.00   Median :13.00   Median :165.0  
##  Mean   :29.94    Mean   :33.52   Mean   :12.85   Mean   :165.3  
##  3rd Qu.:34.00    3rd Qu.:38.00   3rd Qu.:14.00   3rd Qu.:170.0  
##  Max.   :74.00    Max.   :77.00   Max.   :18.00   Max.   :229.0  
##  NA's   :2570     NA's   :2570    NA's   :2819    NA's   :2735   
##        kg              bmi               cm_o           bmi_o       
##  Min.   : 20.00   Min.   :  7.703   Min.   : 50.0   Min.   : 7.703  
##  1st Qu.: 54.00   1st Qu.: 19.818   1st Qu.:160.0   1st Qu.:19.818  
##  Median : 60.00   Median : 21.631   Median :165.0   Median :21.630  
##  Mean   : 62.23   Mean   : 22.788   Mean   :161.3   Mean   :22.840  
##  3rd Qu.: 67.00   3rd Qu.: 24.342   3rd Qu.:170.0   3rd Qu.:24.314  
##  Max.   :149.00   Max.   :103.939   Max.   :229.0   Max.   :86.565  
##  NA's   :2765     NA's   :2786      NA's   :2735    NA's   :3214
round(apply(users_demo, 2, sd, na.rm = TRUE), digits = 2)
## age_registration          age_now    menarche_year               cm 
##             6.36             6.69             1.61             6.73 
##               kg              bmi             cm_o            bmi_o 
##            13.14             4.76            21.25             5.02
100 - round(apply(users_demo, 2, function(x) sum(is.na(x)))/nrow(users)*100)
## age_registration          age_now    menarche_year               cm 
##               81               81               79               80 
##               kg              bmi             cm_o            bmi_o 
##               80               80               80               76
users_demo_sympto = users_demo
save(users_demo_sympto,file = paste0(IO$restricted_figure_data,"users_demo_sympto.Rdata"))

rm(users, users_demo, users_demo_sympto)

3 Tracking frequency

nothing to be done - everything is in the FAM_figures.Rmd

4 Overall observation profiles

4.1 Kindara

source("Scripts/_setup_Kindara.R")
folder = paste0(IO$kindara_data_days,"03 selected/")
days.file.list = list.files(folder)
days.file.list = paste0(folder,days.file.list )

rm(folder)

4.1.1 Temperature

cycleday_from_end = -28:-1

breaks = seq(-0.6,2,by = 0.1)-0.05
mids = breaks[2:length(breaks)]-0.05

registerDoParallel(par$n_cores)

tic()
temp.hist = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'temp_diff_from_p25', 
                  breaks = breaks, 
                  from_start = FALSE,
                  cycleday_v = cycleday_from_end)}
toc()
## 42.615 sec elapsed
colnames(temp.hist) = cycleday_from_end

n.tot = colSums(temp.hist)
temp.hist.norm = t(t(temp.hist)/ n.tot )


delta_BBT_distribution_kindara = data.frame(cycleday = cycleday_from_end, 
                                            d = t(temp.hist.norm))
colnames(delta_BBT_distribution_kindara) = c('cycleday',round(mids, digits = 2))
  
write.csv(delta_BBT_distribution_kindara, 
          file = paste0(IO$output_csv, 'delta_BBT_distribution_kindara.csv'),
          row.names = FALSE)

save(delta_BBT_distribution_kindara, file = paste0(IO$output_Rdata, "delta_BBT_distribution_kindara.Rdata"))


n.cum = apply(temp.hist, 2, cumsum)

n.med = sapply(X = 1:length(cycleday_from_end),which.quant, pc = 0.5, n.cum = n.cum, n.tot = n.tot)
med = mids[n.med]

n.10 = sapply(X = 1:length(cycleday_from_end),which.quant, pc = 0.1, n.cum = n.cum, n.tot = n.tot)
n.25 = sapply(X = 1:length(cycleday_from_end),which.quant, pc = 0.25, n.cum = n.cum, n.tot = n.tot)
n.75 = sapply(X = 1:length(cycleday_from_end),which.quant, pc = 0.75, n.cum = n.cum, n.tot = n.tot)
n.90 = sapply(X = 1:length(cycleday_from_end),which.quant, pc = 0.9, n.cum = n.cum, n.tot = n.tot)

t.10 = mids[n.10]
t.25 = mids[n.25]
t.75 = mids[n.75]
t.90 = mids[n.90]

delta_BBT_summary_kindara = data.frame(cycleday_from_end = cycleday_from_end, 
                                            p.10 = round(t.10, digits = 2),
                                            p.25 = round(t.25, digits = 2),
                                            median = round(med, digits = 2),
                                            p.75 = round(t.75, digits = 2),
                                            p.90 = round(t.90, digits = 2))
write.csv(delta_BBT_summary_kindara, 
          file = paste0(IO$output_csv, 'delta_BBT_summary_kindara.csv'),
          row.names = FALSE)

save(delta_BBT_summary_kindara, file = paste0(IO$output_Rdata, "delta_BBT_summary_kindara.Rdata"))



rm(delta_BBT_summary_kindara, t.90, t.75, t.25, t.10, n.90, n.75, n.25,n.10, med, n.med,delta_BBT_distribution_kindara,temp.hist.norm, n.tot,mids, breaks, cycleday_from_end, n.cum, temp.hist)

4.1.2 Bleeding

breaks = c(-0.25,dict$bleeding$index +0.25)

cycleday_from_end = -15:-1

cycleday = 1:15

tic()
bleeding.hist.end = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'bleeding', 
                  breaks = breaks, 
                  from_start = FALSE,
                  cycleday_v = cycleday_from_end)}
toc()
## 18.849 sec elapsed
tic()
bleeding.hist.start = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'bleeding', 
                  breaks = breaks, 
                  from_start = TRUE,
                  cycleday_v = cycleday)}
toc()
## 17.946 sec elapsed
n.cycles = max(apply(bleeding.hist.start,2,sum))

bleeding.hist.start.norm = bleeding.hist.start / n.cycles
bleeding.hist.end.norm = bleeding.hist.end / n.cycles

bleeding_distribution_kindara = data.frame(
  cycleday = c(cycleday_from_end, cycleday),
  no_bleeding_reported = c(as.numeric(bleeding.hist.end.norm[1,]),as.numeric(bleeding.hist.start.norm[1,])),
  spotting = c(as.numeric(bleeding.hist.end.norm[2,]),as.numeric(bleeding.hist.start.norm[2,])),
  light_bleeding = c(as.numeric(bleeding.hist.end.norm[3,]),as.numeric(bleeding.hist.start.norm[3,])),
  medium_bleeding = c(as.numeric(bleeding.hist.end.norm[4,]),as.numeric(bleeding.hist.start.norm[4,])),
  heavy_bleeding = c(as.numeric(bleeding.hist.end.norm[5,]),as.numeric(bleeding.hist.start.norm[5,]))
)


write.csv(bleeding_distribution_kindara, 
          file = paste0(IO$output_csv, 'bleeding_distribution_kindara.csv'),
          row.names = FALSE)

save(bleeding_distribution_kindara, file = paste0(IO$output_Rdata, "bleeding_distribution_kindara.Rdata"))

rm(bleeding.hist.start.norm, bleeding.hist.end.norm, bleeding.hist.start, bleeding.hist.end, cycleday_from_end, cycleday, bleeding_distribution_kindara)

4.1.3 Mucus

breaks = 0:13-0.5
mids = breaks[2:length(breaks)]-0.5
cycleday_from_end = -28:-1

tic()
mucus.hist = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'mucus', 
                  breaks = breaks, 
                  from_start = FALSE,
                  cycleday_v = cycleday_from_end)}
toc()
## 19.622 sec elapsed
mucus.hist.norm = mucus.hist/n.cycles

cervical_mucus_distribution_kindara = data.frame(cycleday = cycleday_from_end, 
                                            m = t(mucus.hist.norm))
colnames(cervical_mucus_distribution_kindara) = c('cycleday',dict$mucus$names[2:14])


write.csv(cervical_mucus_distribution_kindara, 
          file = paste0(IO$output_csv, 'cervical_mucus_distribution_kindara.csv'),
          row.names = FALSE)

save(cervical_mucus_distribution_kindara, file = paste0(IO$output_Rdata, "cervical_mucus_distribution_kindara.Rdata"))


rm(mucus.hist, mucus.hist.norm, breaks, mids, cycleday_from_end, cervical_mucus_distribution_kindara)

4.1.4 Cervix

breaks = 1:4-0.5
cycleday_from_end = -28:-1

tic()
cervix.openness.hist = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'cervix_openness', 
                  breaks = breaks, 
                  from_start = FALSE,
                  cycleday_v = cycleday_from_end)}
toc()
## 18.239 sec elapsed
cervix.openness.hist.norm = cervix.openness.hist/n.cycles


tic()
cervix.height.hist = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'cervix_height', 
                  breaks = breaks, 
                  from_start = FALSE,
                  cycleday_v = cycleday_from_end)}
toc()
## 16.812 sec elapsed
cervix.height.hist.norm = cervix.height.hist/n.cycles


tic()
cervix.firmness.hist = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'cervix_firmness', 
                  breaks = breaks, 
                  from_start = FALSE,
                  cycleday_v = cycleday_from_end)}
toc()
## 19.475 sec elapsed
cervix.firmness.hist.norm = cervix.firmness.hist/n.cycles


cervix_distribution_kindara = data.frame(cycleday = cycleday_from_end,
                                         o = t(cervix.openness.hist.norm),
                                         f = t(cervix.firmness.hist.norm),
                                         h = t(cervix.height.hist.norm)
                                         )
colnames(cervix_distribution_kindara) = c('cycleday','o_closed','o_medium','o_open',
                                          'f_firm','f_medium','f_soft',
                                          'h_low','h_medium','h_high')


write.csv(cervix_distribution_kindara, 
          file = paste0(IO$output_csv, 'cervix_distribution_kindara.csv'),
          row.names = FALSE)

save(cervix_distribution_kindara, file = paste0(IO$output_Rdata, "cervix_distribution_kindara.Rdata"))


rm(cervix_distribution_kindara, cervix.firmness.hist.norm, cervix.firmness.hist, cervix.height.hist.norm, cervix.height.hist, cervix.openness.hist.norm, cervix.openness.hist, breaks, cycleday_from_end )

4.1.5 Vaginal Sensation

breaks = 1:5-0.5
cycleday_from_end = -28:-1

tic()
vaginal_sensation.hist = foreach(days.file = days.file.list,.combine = combine.sum) %dopar% 
{hist.by.cycleday(file = days.file, 
                  attr = 'vaginal_sensation', 
                  breaks = breaks, 
                  from_start = FALSE,
                  cycleday_v = cycleday_from_end)}
toc()
## 17.022 sec elapsed
vaginal_sensation.hist.norm = vaginal_sensation.hist/n.cycles


vaginal_sensation_distribution_kindara = data.frame(cycleday = cycleday_from_end,
                                         s = t(vaginal_sensation.hist.norm))

colnames(vaginal_sensation_distribution_kindara) = c('cycleday',dict$feel$names[3:6])


write.csv(vaginal_sensation_distribution_kindara, 
          file = paste0(IO$output_csv, 'vaginal_sensation_distribution_kindara.csv'),
          row.names = FALSE)

save(vaginal_sensation_distribution_kindara, file = paste0(IO$output_Rdata, "vaginal_sensation_distribution_kindara.Rdata"))


rm(vaginal_sensation_distribution_kindara, vaginal_sensation.hist, vaginal_sensation.hist.norm, breaks, cycleday_from_end)
rm(days.file.list, n.cycles)

4.2 Sympto

source("Scripts/_setup_Sympto.R")
load(paste0(IO$sympto_data_02_standard_cycles_with_HMM_res,"cycledays.Rdata"), verbose = TRUE)
## Loading objects:
##   cycledays

4.2.1 Temperature

CC = cycledays

n.days.in.cycle = 28
i.end = which(CC$cycleday_from_end >= -n.days.in.cycle)
i.start = which(CC$out_cycleday <= n.days.in.cycle)

cc = CC[i.end,]

uu = unique(cbind(cc$cycleday_from_end, cc$low_norm_temp), by = c('cycleday_from_end','low_norm_temp'))
uuu = aggregate(cc[,c('cycleday_from_end','low_norm_temp')], by = list(cc$cycleday_from_end,cc$low_norm_temp), length)
uuu = uuu[,1:3]
colnames(uuu) = c('cycleday_from_end','low_norm_temp','n')

uuu$density = uuu$n/max(uuu$n)

delta_BBT_distribution_sympto = uuu[order(uuu$cycleday_from_end,uuu$low_norm_temp),-3]

write.csv(delta_BBT_distribution_sympto, 
          file = paste0(IO$output_csv, 'delta_BBT_distribution_sympto.csv'),
          row.names = FALSE)

save(delta_BBT_distribution_sympto, file = paste0(IO$output_Rdata,"delta_BBT_distribution_sympto.Rdata") )

quant = aggregate(low_norm_temp ~ cycleday_from_end, cc, quantile, seq(0,1,by=0.05))

delta_BBT_summary_sympto = data.frame(cycleday =  quant$cycleday_from_end,
                                      p.10 = round(quant$low_norm_temp[,3], digits = 3),
                                      p.25 = round(quant$low_norm_temp[,6], digits = 3),
                                      median = round(quant$low_norm_temp[,11], digits = 3),
                                      p.75 = round(quant$low_norm_temp[,16], digits = 3),
                                      p.90 = round(quant$low_norm_temp[,19], digits = 3)
                                      )

write.csv(delta_BBT_summary_sympto, 
          file = paste0(IO$output_csv, 'delta_BBT_summary_sympto.csv'),
          row.names = FALSE)

save(delta_BBT_summary_sympto, file = paste0(IO$output_Rdata,"delta_BBT_summary_sympto.Rdata") )

rm(delta_BBT_distribution_sympto, delta_BBT_summary_sympto, quant, uu, uuu, cc, i.end, i.start, n.days.in.cycle)

4.2.2 Bleeding

obs = 'b'
n.days.in.cycle = 15
i.end = which(CC$cycleday_from_end >= -n.days.in.cycle)
i.start = which(CC$out_cycleday <= n.days.in.cycle)

#bleeding from start

cc = CC[i.start,]
N = length(unique(cc$cycle_id))
breaks = seq(0.5, n.days.in.cycle+0.5, by = 1)
for(b in c(1,2,3)){
  h = hist(cc$out_cycleday[cc$blood == b], breaks = breaks, plot = FALSE)
  res = data.frame(out_cycleday = h$mids, blood = paste0(obs,'.',b),freq = h$counts/N )
  if(b == 1){resB = res}else{resB = rbind(resB, res)}
}

resB$blood = factor(resB$blood)
resB = cast(resB, out_cycleday ~ blood )
## Using freq as value column.  Use the value argument to cast to override this choice
colnames(resB) = c('cycleday','light_bleeding','medium_bleeding','high_bleeding')

resB.start = resB

#bleeding from end
cc = CC[i.end,]
N = length(unique(cc$cycle_id))
breaks = seq(-n.days.in.cycle-0.5, -0.5, by = 1)
for(b in c(1,2,3)){
  h = hist(cc$cycleday_from_end[cc$blood == b], breaks = breaks, plot = FALSE)
  res = data.frame(out_cycleday = h$mids, blood = paste0(obs,'.',b),freq = h$counts/N)
  if(b == 1){resB = res}else{resB = rbind(resB, res)}
}

resB$blood = factor(resB$blood)
resB = cast(resB, out_cycleday ~ blood )
## Using freq as value column.  Use the value argument to cast to override this choice
colnames(resB) = c('cycleday','light_bleeding','medium_bleeding','high_bleeding')

resB.end = resB

resB = rbind(resB.end, resB.start)

bleeding_distribution_sympto = resB

write.csv(bleeding_distribution_sympto, 
          file = paste0(IO$output_csv, 'bleeding_distribution_sympto.csv'),
          row.names = FALSE)
save(bleeding_distribution_sympto, file = paste0(IO$output_Rdata, 'bleeding_distribution_sympto.Rdata'))

rm(res, resB, resB.end, resB.start, bleeding_distribution_sympto, N, b, n.days.in.cycle, i.start, i.end, h, cc, obs, breaks)

4.2.3 Mucus

n.days.in.cycle = 28
i.end = which(CC$cycleday_from_end >= -n.days.in.cycle)
i.start = which(CC$out_cycleday <= n.days.in.cycle)

obs = 'm'

#mucus from end
cc = CC[i.end,]
N = length(unique(cc$cycle_id))
breaks = seq(-n.days.in.cycle-0.5, -0.5, by = 1)
for(m in c(1,2,3,4)){
  h = hist(cc$cycleday_from_end[cc$elixir == m], breaks = breaks, plot = FALSE)
  res = data.frame(out_cycleday = h$mids, mucus = paste0(obs,'.',m),freq = h$counts/N )
  if(m == 1){resM = res}else{resM = rbind(resM, res)}
}

resM$mucus = factor(resM$mucus )
resM = cast(resM, out_cycleday ~ mucus )
## Using freq as value column.  Use the value argument to cast to override this choice
colnames(resM) = c('cycleday',dict$mucus$hmm.symbols[2:5])


cervical_mucus_distribution_sympto = resM


write.csv(cervical_mucus_distribution_sympto, 
          file = paste0(IO$output_csv, 'cervical_mucus_distribution_sympto.csv'),
          row.names = FALSE)


save(cervical_mucus_distribution_sympto, file = paste0(IO$output_Rdata, 'cervical_mucus_distribution_sympto.Rdata'))

rm(res, resM, cervical_mucus_distribution_sympto, N, m, n.days.in.cycle, i.start, i.end, h, cc, obs, breaks)

4.2.4 Cervix

obs = 'c'
n.days.in.cycle = 28
i.end = which(CC$cycleday_from_end >= -n.days.in.cycle)
i.start = which(CC$out_cycleday <= n.days.in.cycle)


cc = CC[i.end,]
N = length(unique(cc$cycle_id))
breaks = seq(-n.days.in.cycle-0.5, -0.5, by = 1)
for(c in c(11,12,13)){
  h = hist(cc$cycleday_from_end[cc$feel == c], breaks = breaks, plot = FALSE)
  res = data.frame(out_cycleday = h$mids, cervix = paste0(obs,'.',c),freq = h$counts/N )
  if(c == 11){resC = res}else{resC = rbind(resC, res)}
}

resC$cervix = factor(resC$cervix )
resC = cast(resC, out_cycleday ~ cervix )
## Using freq as value column.  Use the value argument to cast to override this choice
colnames(resC) = c('cycleday',dict$cervix$names[5:7])

cervix_distribution_sympto = resC

write.csv(cervix_distribution_sympto, 
          file = paste0(IO$output_csv, 'cervix_distribution_sympto.csv'),
          row.names = FALSE)

save(cervix_distribution_sympto, file = paste0(IO$output_Rdata, 'cervix_distribution_sympto.Rdata'))

rm(res, resC, cervix_distribution_sympto, N, c, n.days.in.cycle, i.start, i.end, h, cc, obs, breaks)

4.2.5 Vaginal Feel

obs = "c"
n.days.in.cycle = 28
i.end = which(CC$cycleday_from_end >= -n.days.in.cycle)
i.start = which(CC$out_cycleday <= n.days.in.cycle)

cc = CC[i.end,]
N = length(unique(cc$cycle_id))
breaks = seq(-n.days.in.cycle-0.5, -0.5, by = 1)
for(c in c(1,2,3)){
  h = hist(cc$cycleday_from_end[cc$feel == c], breaks = breaks, plot = FALSE)
  res = data.frame(out_cycleday = h$mids, cervix = paste0(obs,'.',c),freq = h$counts/N )
  if(c == 1){resC = res}else{resC = rbind(resC, res)}
}

resC$cervix = factor(resC$cervix )
resC = cast(resC, out_cycleday ~ cervix )
## Using freq as value column.  Use the value argument to cast to override this choice
colnames(resC) = c('cycleday',dict$cervix$names[2:4])

vaginal_sensation_distribution_sympto = resC 

write.csv(vaginal_sensation_distribution_sympto, 
          file = paste0(IO$output_csv, 'vaginal_sensation_distribution_sympto.csv'),
          row.names = FALSE)

save(vaginal_sensation_distribution_sympto, file = paste0(IO$output_Rdata, 'vaginal_sensation_distribution_sympto.Rdata'))

rm(res, resC, vaginal_sensation_distribution_sympto, N, c, n.days.in.cycle, i.start, i.end, h, cc, obs, breaks)

5 Ovulation Estimation: Distribution of cycle phase duration

5.1 Kindara

source("Scripts/_setup_Kindara.R")
cycle.file = paste0(IO$kindara_data_cycles,"06 CYCLES with reliable ovulation estimation.Rdata") 
file = paste0(IO$kindara_data_cycles,"05 standard with ovu est CYCLES.Rdata")
load(file, verbose = TRUE)
CYCLES_all = CYCLES
CYCLES = CYCLES[which(CYCLES$reliable.ovu.est),]
save(CYCLES, file = cycle.file)
rm(CYCLES_all)
breaks = seq(0,63,by = 1)+0.5
mids = breaks[2:length(breaks)]-0.5

length.hist = hist.cycles.par(file = cycle.file, 
                 attr = 'length', 
                 breaks = breaks)
## ../../FAM-Restricted-Access-Repo/Data/Kindara/Cycles/06 CYCLES with reliable ovulation estimation.Rdata 
## 80663 
## 80356 
## 0.996
n.cycles = sum(length.hist)
length.hist.norm = length.hist/n.cycles

# ovulation time

ovu.hist = hist.cycles.par(file = cycle.file, 
                 attr = 'ovu', 
                 breaks = breaks)
## ../../FAM-Restricted-Access-Repo/Data/Kindara/Cycles/06 CYCLES with reliable ovulation estimation.Rdata 
## 80663 
## 80545 
## 0.999
ovu.hist.norm = ovu.hist/n.cycles

# luteal duration


lut.hist = hist.cycles.par(file = cycle.file, 
                 attr = 'luteal.duration', 
                 breaks = breaks)
## ../../FAM-Restricted-Access-Repo/Data/Kindara/Cycles/06 CYCLES with reliable ovulation estimation.Rdata 
## 80663 
## 80640 
## 1
lut.hist.norm = lut.hist/n.cycles

plot(mids, length.hist.norm, type = 'l', lwd = 3, col = viz_cols$dark.red, ylim = range(length.hist.norm,ovu.hist.norm,lut.hist.norm))
points(mids, ovu.hist.norm, type = 'l', lwd = 3, col = "black")
points(mids, lut.hist.norm, type = 'l', lwd = 3, col = viz_cols$dark.yellow)
abline(v = 28)

####
cycle_phases_duration_distribution_kindara = data.frame(duration = mids, 
                                            cycle_length = length.hist.norm,
                                            estimated_ovulation = ovu.hist.norm,
                                            estimated_luteal_phase = lut.hist.norm)

write.csv(cycle_phases_duration_distribution_kindara, 
          file = paste0(IO$output_csv, 'cycle_phases_duration_distribution_kindara.csv'),
          row.names = FALSE)

save(cycle_phases_duration_distribution_kindara, file = paste0(IO$output_Rdata, 'cycle_phases_duration_distribution_kindara.Rdata') )

5.2 Sympto

source("Scripts/_setup_Sympto.R")
cycle.file = paste0(IO$sympto_data_03_reliable_ovulation_estimation,"cycles.Rdata")
file = paste0(IO$sympto_data_02_standard_cycles_with_HMM_res,"cycles.Rdata") 
load(file, verbose = TRUE)
cycles = cycles[which(cycles$reliable.ovu.est),]
save(cycles, file = cycle.file)
load(cycle.file, verbose = TRUE)
## Loading objects:
##   cycles
at.y = seq(0,0.22,by = 0.02)
at.x = seq(0,65,by = 7)


x = cycles$cycle_length
h = plot_hist(x = x, bin.size = 1, 
              at.x = seq(0,65,by = 7), at.y = at.y,
              xlab = 'Cycle length',ylab = '% of cycles',
              col = 'tomato2', border = 'tomato3', bars = FALSE)

h.cycle_length = h

x = cycles$ovu
h = plot_hist(x = x, bin.size = 1, 
              at.x = seq(0,65,by = 7), at.y = at.y,
              xlab = 'Estimated ovulation day',ylab = '% of cycles',
              col = 'gray30', border = 'gray25', bars = FALSE)

h.ovu = h

x = cycles$luteal.duration
h = plot_hist(x = x, bin.size = 1, 
              at.x = seq(0,65,by = 7), at.y = at.y,
              xlab = 'Estimated luteal phase duration',ylab = '% of cycles',
              col = viz_cols$yellow, border = viz_cols$dark.yellow, bars = FALSE)

h.lut = h

breaks = seq(0,63,by = 1)+0.5
mids = breaks[2:length(breaks)]-0.5


cycle_phases_duration_distribution_sympto = data.frame(duration = mids)
m = match(cycle_phases_duration_distribution_sympto$duration, h.cycle_length$mids)
cycle_phases_duration_distribution_sympto$cycle_length = h.cycle_length$density[m]
m = match(cycle_phases_duration_distribution_sympto$duration, h.ovu$mids)
cycle_phases_duration_distribution_sympto$estimated_ovulation = h.ovu$density[m]
m = match(cycle_phases_duration_distribution_sympto$duration, h.lut$mids)
cycle_phases_duration_distribution_sympto$estimated_luteal_phase = h.lut$density[m]

# by goals

contra = which((cycles$reliable.ovu.est) & (cycles$goal_txt == 'Contraception'))
concep = which((cycles$reliable.ovu.est) & (cycles$goal_txt == 'Conception'))

x = cycles$cycle_length[contra]
h.cl.contra = plot_hist(x = x, bin.size = 1, 
                        at.x = seq(0,65,by = 7), at.y = at.y,
                        xlab = 'Cycle length',ylab = '% of cycles',
                        col = viz_cols$yellow, border = viz_cols$dark.yellow)

x = cycles$cycle_length[concep]
h.cl.concep = plot_hist(x = x, bin.size = 1, 
                        at.x = seq(0,65,by = 7), at.y = at.y,
                        xlab = 'Cycle length',ylab = '% of cycles',
                        col = viz_cols$yellow, border = viz_cols$dark.yellow)

x = cycles$ovu[contra]
h.ovu.contra = plot_hist(x = x, bin.size = 1, 
                         at.x = seq(0,65,by = 7), at.y = at.y,
                         xlab = 'Estimated ovulation day',ylab = '% of cycles',
                         col = viz_cols$yellow, border = viz_cols$dark.yellow)

x = cycles$ovu[concep]
h.ovu.concep = plot_hist(x = x, bin.size = 1, 
                         at.x = seq(0,65,by = 7), at.y = at.y,
                         xlab = 'Estimated ovulation day',ylab = '% of cycles',
                         col = viz_cols$yellow, border = viz_cols$dark.yellow)

x = cycles$luteal.duration[contra]
h.lut.contra = plot_hist(x = x, bin.size = 1, 
                         at.x = seq(0,65,by = 7), at.y = at.y,
                         xlab = 'Estimated luteal phase duration',ylab = '% of cycles',
                         col = viz_cols$yellow, border = viz_cols$dark.yellow)

x = cycles$luteal.duration[concep]
h.lut.concep = plot_hist(x = x, bin.size = 1, 
                         at.x = seq(0,65,by = 7), at.y = at.y,
                         xlab = 'Estimated luteal phase duration',ylab = '% of cycles',
                         col = viz_cols$yellow, border = viz_cols$dark.yellow)

m = match(cycle_phases_duration_distribution_sympto$duration, h.cl.contra$mids)
cycle_phases_duration_distribution_sympto$cycle_length_avoid_preg = h.cl.contra$density[m]
m = match(cycle_phases_duration_distribution_sympto$duration, h.cl.concep$mids)
cycle_phases_duration_distribution_sympto$cycle_length_seek_preg = h.cl.concep$density[m]

m = match(cycle_phases_duration_distribution_sympto$duration, h.ovu.contra$mids)
cycle_phases_duration_distribution_sympto$estimated_ovulation_avoid_preg = h.ovu.contra$density[m]
m = match(cycle_phases_duration_distribution_sympto$duration, h.ovu.concep$mids)
cycle_phases_duration_distribution_sympto$estimated_ovulation_seek_preg = h.ovu.concep$density[m]


m = match(cycle_phases_duration_distribution_sympto$duration, h.lut.contra$mids)
cycle_phases_duration_distribution_sympto$estimated_luteal_phase_avoid_preg = h.lut.contra$density[m]
m = match(cycle_phases_duration_distribution_sympto$duration, h.lut.concep$mids)
cycle_phases_duration_distribution_sympto$estimated_luteal_phase_seek_preg = h.lut.concep$density[m]



test = cycle_phases_duration_distribution_sympto
test[is.na(test)] = 0
cycle_phases_duration_distribution_sympto = test

write.csv(cycle_phases_duration_distribution_sympto, 
          file = paste0(IO$output_csv, 'cycle_phases_duration_distribution_sympto.csv'),
          row.names = FALSE)

save(cycle_phases_duration_distribution_sympto,file = paste0(IO$output_Rdata, 'cycle_phases_duration_distribution_sympto.Rdata') )

6 Estimation parameters: Temperature shift, uncertainty and confidence score

6.1 Kindara

source("Scripts/_setup_Kindara.R")
cycle.file = paste0(IO$kindara_data_cycles,"05 standard with ovu est CYCLES.Rdata") 
load(cycle.file, verbose = TRUE)
## Loading objects:
##   CYCLES
# DELTA T
########################

by = 0.1
mids = seq(0,2,by = by)
breaks = c(mids[1]-by/2, mids + by/2)


DT.hist = hist.cycles.par(file = cycle.file, 
                 attr = 'D.T', 
                 breaks = breaks,
                 filter = FALSE)
## ../../FAM-Restricted-Access-Repo/Data/Kindara/Cycles/05 standard with ovu est CYCLES.Rdata 
## 719182 
## 523206 
## 0.728
n.cycles = sum(DT.hist)

DT.hist.norm = DT.hist/n.cycles

temperature_shift_distribution_kindara = data.frame(temperature_shifts = mids, 
                                                        fraction_of_cycles = DT.hist.norm)

write.csv(temperature_shift_distribution_kindara, 
          file = paste0(IO$output_csv, 'temperature_shift_distribution_kindara.csv'),
          row.names = FALSE)


save(temperature_shift_distribution_kindara, 
          file = paste0(IO$output_Rdata, 'temperature_shift_distribution_kindara.Rdata'))


# SD on OVU ESTIMATION
########################

by = 0.1
mids = seq(0,10,by = by)
breaks = c(mids[1]-by/2, mids + by/2)

sd.hist = hist.cycles.par(file = cycle.file, 
                 attr = 'ovu.sd', 
                 breaks = breaks,
                 filter = FALSE)
## ../../FAM-Restricted-Access-Repo/Data/Kindara/Cycles/05 standard with ovu est CYCLES.Rdata 
## 719182 
## 522148 
## 0.726
n.cycles = sum(sd.hist)
sd.hist.norm = sd.hist/n.cycles

uncertainties_distribution_kindara = data.frame(uncertainties = mids, 
                                                    fraction_of_cycles = sd.hist.norm)

write.csv(uncertainties_distribution_kindara, 
          file = paste0(IO$output_csv, 'uncertainties_distribution_kindara.csv'),
          row.names = FALSE)


save(uncertainties_distribution_kindara, 
          file = paste0(IO$output_Rdata, 'uncertainties_distribution_kindara.Rdata'))


# CONFIDENCE
########################

by = 0.05
mids = seq(0,1,by = by)
breaks = c(mids[1]-by/2, mids + by/2)

confidence.hist = hist.cycles.par(file = cycle.file, 
                 attr = 'confidence', 
                 breaks = breaks,
                 filter = FALSE)
## ../../FAM-Restricted-Access-Repo/Data/Kindara/Cycles/05 standard with ovu est CYCLES.Rdata 
## 719182 
## 523425 
## 0.728
n.cycles = sum(sd.hist)
confidence.hist.norm = confidence.hist/n.cycles

confidence_scores_distribution_kindara = data.frame(confidence_score = mids, 
                                                fraction_of_cycles = confidence.hist.norm)

write.csv(confidence_scores_distribution_kindara, 
          file = paste0(IO$output_csv, 'confidence_scores_distribution_kindara.csv'),
          row.names = FALSE)

save(confidence_scores_distribution_kindara, 
          file = paste0(IO$output_Rdata, 'confidence_scores_distribution_kindara.Rdata'))



rm(by, mids, breaks, n.cycles, confidence.hist, confidence.hist.norm, DT.hist, DT.hist.norm, uncertainties_distribution_kindara, confidence_scores_distribution_kindara, sd.hist, sd.hist.norm, CYCLES, cycle.file)

6.2 Sympto

source("Scripts/_setup_Sympto.R")
file = paste0(IO$sympto_data_02_standard_cycles_with_HMM_res,"cycles.Rdata") 
load(file, verbose = TRUE)
## Loading objects:
##   cycles
# DELTA T
########################

x = cycles$D.T[cycles$D.T >= 0]
h.DT = plot_hist(x = x, bin.size = 0.05, 
          at.x = seq(0,1,by = 0.1), at.y = seq(0,0.25,by = 0.05),
          xlab = 'Temperature shifts',ylab = '% of cycles' )

temperature_shift_distribution_sympto = data.frame(temperature_shifts = h.DT$mids, 
                                                        fraction_of_cycles = h.DT$density)

write.csv(temperature_shift_distribution_sympto, 
          file = paste0(IO$output_csv, 'temperature_shift_distribution_sympto.csv'),
          row.names = FALSE)


# SD on OVU ESTIMATION
########################

x = cycles$ovu.sd[cycles$ovu.sd >= 0]
h.sd = plot_hist(x = x, bin.size = 0.05, 
          at.x = seq(0,4,by = 0.5), at.y = seq(0,0.1,by = 0.02),
          xlab = 'Uncertainty on ovu. estimates (in days)',ylab = '% of cycles' )

uncertainties_distribution_sympto = data.frame(uncertainties = h.sd$mids, 
                                                    fraction_of_cycles = h.sd$density)

write.csv(uncertainties_distribution_sympto, 
          file = paste0(IO$output_csv, 'uncertainties_distribution_sympto.csv'),
          row.names = FALSE)


# CONFIDENCE
########################

x = cycles$confidence[cycles$confidence >= 0]
h.c = plot_hist(x = x, bin.size = 0.05, 
          at.x = seq(0,1,by = 0.2), at.y = seq(0,0.4,by = 0.05),
          xlab = 'Confidence score on ovu. estimates',ylab = '% of cycles' )

confidence_scores_distribution_sympto = data.frame(confidence_score = h.c$mids, 
                                                fraction_of_cycles = h.c$density)

write.csv(confidence_scores_distribution_sympto, 
          file = paste0(IO$output_csv, 'confidence_scores_distribution_sympto.csv'),
          row.names = FALSE)

7 Estimation parameters: Temperature shift, uncertainty and confidence score

7.1 Kindara

source("Scripts/_setup_Kindara.R")
days.folder.postHMM = paste0(IO$kindara_data_days,"05 post HMM/")
days.file.list = paste0(days.folder.postHMM,list.files(days.folder.postHMM))

breaks = c(0, seq(18,45,by = 3), Inf)

registerDoParallel(par$n_cores)

tic()
state_probs = foreach(day.file = days.file.list,.combine = combine.sum.state.probs) %dopar% 
{compute.state.probs(file = day.file, 
                 fun = 'sum', 
                 breaks = breaks,
                 cycleday_lim = c(-40, 40))}
toc()
## 77.547 sec elapsed
write.csv(state_probs, 
          file = paste0(IO$output_csv, 'states_probabilities_by_cycle_length_kindara.csv'),
          row.names = FALSE)


save(state_probs, 
          file = paste0(IO$output_Rdata, 'states_probabilities_by_cycle_length_kindara.Rdata'))

7.2 Sympto

source("Scripts/_setup_Sympto.R")
load(paste0(IO$sympto_data_02_standard_cycles_with_HMM_res,"cycledays.Rdata"), verbose = TRUE)
load(paste0(IO$sympto_data_03_reliable_ovulation_estimation,"cycles.Rdata"), verbose = TRUE)

m = match(cycledays$cycle_id, cycles$cycle_id)
cycledays = cycledays[!is.na(m),]
save(cycledays, file = paste0(IO$sympto_data_03_reliable_ovulation_estimation,"cycledays.Rdata") )
load(paste0(IO$sympto_data_03_reliable_ovulation_estimation,"cycledays.Rdata"), verbose = TRUE)
## Loading objects:
##   cycledays
load(paste0(IO$sympto_data_03_reliable_ovulation_estimation,"cycles.Rdata"), verbose = TRUE)
## Loading objects:
##   cycles
breaks = c(11, seq(18,45,by = 3), 365)
cycle_length_bin = cut(cycledays$cycle_length, breaks = breaks)
cycledays$cycle_length_bin = cycle_length_bin
rm(cycle_length_bin)

j = (cycledays$cycleday_from_ovu >= -40) & (cycledays$cycleday_from_ovu <= 40)

agg = aggregate( x = cycledays[j,44:52], 
                 by =  list(cycle_length_bin = cycledays$cycle_length_bin[j], 
                            cycleday_from_ovu = cycledays$cycleday_from_ovu[j]),FUN =  sum)

agg_sum = aggregate(cycle_id ~ cycle_length_bin, cycledays[j,], function(x) length(unique(x)))


colnames(agg)[3:11] = paste0('state.',colnames(agg)[3:11])
agg$cycle_length_bin_num = as.numeric(agg$cycle_length_bin)


state_probs = reshape(agg, idvar =  c("cycle_length_bin","cycle_length_bin_num","cycleday_from_ovu"),
        times = c("hM" , "lM", "lE" , "hE", "O", "Rise" , "hP" , "Ep" , "lP"), timevar = "state",
        varying = list(3:11), v.names = "prob", direction = "long")

state_probs$state = factor(state_probs$state, levels = c("hM" , "lM", "lE" , "hE", "O", "Rise" , "hP" , "Ep" , "lP"))

j = which(state_probs$cycle_length_bin_num %in% range(state_probs$cycle_length_bin_num))
state_probs = state_probs[-j,]


state_probs$prob_sum = state_probs$prob
m = match(state_probs$cycle_length_bin, agg_sum$cycle_length_bin)
state_probs$n_cycles = agg_sum$cycle_id[m]
state_probs$prob = state_probs$prob/state_probs$n_cycles


write.csv(state_probs, 
          file = paste0(IO$output_csv, 'states_probabilities_by_cycle_length_sympto.csv'),
          row.names = FALSE)

save(state_probs, 
          file = paste0(IO$output_Rdata, 'states_probabilities_by_cycle_length_sympto.Rdata'))

g = ggplot(state_probs, aes(x = cycleday_from_ovu, y = prob, fill = state ))
g = g + geom_area(position = 'identity', alpha = 0.7) + 
  facet_grid(cycle_length_bin ~ ., scales = "free_y") +
  scale_fill_manual(values = hmm_par$cycle.states.colors) +
  xlab("Cycleday from estimated ovulation")+
  ylab("Probabilities * Nb of cycles")

g + theme_hc() + 
  theme(strip.text.y = element_text(angle = 0),
        strip.background = element_blank())